# load libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(ggthemes)
library(viridis)
## Loading required package: viridisLite
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(wrapr)
## 
## Attaching package: 'wrapr'
## 
## The following objects are masked from 'package:data.table':
## 
##     :=, let
## 
## The following object is masked from 'package:dplyr':
## 
##     coalesce
## 
## The following objects are masked from 'package:tidyr':
## 
##     pack, unpack
## 
## The following object is masked from 'package:tibble':
## 
##     view
library(tidyplots)

# load tables
gnomAD_main_table_full_cCREs <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs.rds")
gnomAD_main_table_full_cCREs_in_TF <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_in_TF.rds")
gnomAD_main_table_full_cCREs_in_rep <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_in_rep.rds")
gnomAD_main_table_full_emVar_cat <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_emVar_cat.rds")
gnomAD_main_table_full_cCREs_emVar_cat <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_emVar_cat.rds")
gnomAD_main_table_full_cCREs_pleio <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_pleio.rds")
gnomAD_main_table_full_cCREs_pleio_mean <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_pleio_mean.rds")
gnomAD_main_table_full_CADD <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_CADD.rds")
gnomAD_main_table_full_cCREs_CADD <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_CADD.rds")
gnomAD_main_table_full_cCREs_mean <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_mean.rds")
gnomAD_main_table_full_cCREs_mean_in_TF <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_mean_in_TF.rds")
gnomAD_main_table_full_cCREs_mean_in_rep <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_mean_in_rep.rds")
gnomAD_main_table_full_cCREs_K562 <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_K562.rds")
gnomAD_main_table_full_cCREs_K562_in_TF <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_K562_in_TF.rds")
gnomAD_main_table_full_cCREs_K562_in_rep <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_K562_in_rep.rds")
gnomAD_main_table_full_cCREs_HepG2 <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_HepG2.rds")
gnomAD_main_table_full_cCREs_HepG2_in_TF <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_HepG2_in_TF.rds")
gnomAD_main_table_full_cCREs_HepG2_in_rep <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_HepG2_in_rep.rds")
gnomAD_main_table_full_cCREs_SKNSH <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_SKNSH.rds")
gnomAD_main_table_full_cCREs_SKNSH_in_TF <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_SKNSH_in_TF.rds")
gnomAD_main_table_full_cCREs_SKNSH_in_rep <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_SKNSH_in_rep.rds")
gnomAD_main_table_full_cCREs_mean_ref <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_mean_ref.rds")
gnomAD_main_table_full_cCREs_K562_ref <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_K562_ref.rds")
gnomAD_main_table_full_cCREs_HepG2_ref <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_HepG2_ref.rds")
gnomAD_main_table_full_cCREs_SKNSH_ref <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_SKNSH_ref.rds")
gnomAD_main_table_full_in_TF_mean <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_TF_mean.rds")
gnomAD_main_table_full_in_TF_K562 <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_TF_K562.rds")
gnomAD_main_table_full_in_TF_HepG2 <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_TF_HepG2.rds")
gnomAD_main_table_full_in_TF_SKNSH <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_TF_SKNSH.rds")
gnomAD_main_table_full_in_TF_mean_ref <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_TF_mean_ref.rds")
gnomAD_main_table_full_in_TF_K562_ref <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_TF_K562_ref.rds")
gnomAD_main_table_full_in_TF_HepG2_ref <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_TF_HepG2_ref.rds")
gnomAD_main_table_full_in_TF_SKNSH_ref <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_TF_SKNSH_ref.rds")
gnomAD_main_table_full_in_rep_mean <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_rep_mean.rds")
gnomAD_main_table_full_in_rep_K562 <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_rep_K562.rds")
gnomAD_main_table_full_in_rep_HepG2 <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_rep_HepG2.rds")
gnomAD_main_table_full_in_rep_SKNSH <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_rep_SKNSH.rds")
gnomAD_main_table_full_in_rep_mean_ref <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_rep_mean_ref.rds")
gnomAD_main_table_full_in_rep_K562_ref <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_rep_K562_ref.rds")
gnomAD_main_table_full_in_rep_HepG2_ref <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_rep_HepG2_ref.rds")
gnomAD_main_table_full_in_rep_SKNSH_ref <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_in_rep_SKNSH_ref.rds")
gnomAD_main_table_full_cCREs_category <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_category.rds")
gnomAD_main_table_full_cCREs_mean_category <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_mean_category.rds")
gnomAD_main_table_full_cCREs_K562_category <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_K562_category.rds")
gnomAD_main_table_full_cCREs_HepG2_category <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_HepG2_category.rds")
gnomAD_main_table_full_cCREs_SKNSH_category <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_SKNSH_category.rds")
gnomAD_main_table_full_cCREs_mean_mean <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_mean_mean.rds")
gnomAD_main_table_full_cCREs_K562_K562 <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_K562_K562.rds")
gnomAD_main_table_full_cCREs_HepG2_HepG2 <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_HepG2_HepG2.rds")
gnomAD_main_table_full_cCREs_SKNSH_SKNSH <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_SKNSH_SKNSH.rds")
gnomAD_main_table_full_cCREs_emVar_cat_mean <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_emVar_cat_mean.rds")
gnomAD_main_table_full_cCREs_emVar_cat_K562 <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_emVar_cat_K562.rds")
gnomAD_main_table_full_cCREs_emVar_cat_HepG2 <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_emVar_cat_HepG2.rds")
gnomAD_main_table_full_cCREs_emVar_cat_SKNSH <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_emVar_cat_SKNSH.rds")
gnomAD_main_table_full_cCREs_K562_HepG2 <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_K562_HepG2.rds")
gnomAD_main_table_full_cCREs_K562_SKNSH <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_K562_SKNSH.rds")
gnomAD_main_table_full_cCREs_HepG2_SKNSH <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table_full_cCREs_HepG2_SKNSH.rds")
gnomAD_main_table <- readRDS("../../results/gnomAD_selection_preprocess/gnomAD_main_table.rds")

# how many emVars
gnomAD_main_table_summ_emVar <- gnomAD_main_table %>% 
    group_by(emVar_category) %>% 
    summarise(count = sum(count)) %>% 
    ungroup() %>% 
    mutate(prop = count/sum(count))

gnomAD_main_table_summ_emVar %>% filter(emVar_category != "none") %>% 
  tidyplot(x = emVar_category, y = count, color = emVar_category) %>% 
  add_barstack_absolute() %>% 
  remove_legend() %>% 
  adjust_x_axis(rotate_labels = TRUE) + 
  geom_text(aes(label = count), angle = 90, hjust = -0.1, size = 3) +
  xlab("emVar category") + 
  ylab("Count") + 
  ylim(c(0, 3500000))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

ggsave("../../results/gnomAD_selection_analysis/how_many_emVar.pdf")
## Saving 7 x 5 in image
## Warning: 'mode(bg)' differs between new and previous
##   ==> NOT changing 'bg'
# how many emVars per cCRE
gnomAD_main_table_summ_emVar_cCRE <- gnomAD_main_table %>% 
  mutate(emVar = ifelse(emVar_category == "none", FALSE, TRUE)) %>% 
    group_by(emVar, ENCODE_cCREs) %>% 
    summarise(count = sum(count)) %>% 
    ungroup() %>% 
    mutate(prop = count/sum(count))
## `summarise()` has grouped output by 'emVar'. You can override using the
## `.groups` argument.
gnomAD_main_table_summ_emVar_cCRE %>% 
  filter(ENCODE_cCREs != "Other cCREs", emVar) %>% 
  tidyplot(x = ENCODE_cCREs, y = count, color = emVar) %>% 
  add_mean_bar() %>% 
  adjust_x_axis(rotate_labels = TRUE) + 
  geom_text(aes(label = count), angle = 90, hjust = -0.1, size = 3, 
            position = position_dodge(.9), show.legend = FALSE) +
  xlab("ENCODE cCREs") + 
  ylab("Count")

ggsave("../../results/gnomAD_selection_analysis/how_many_emVar_cCRE.pdf")
## Saving 7 x 5 in image
## Warning: 'mode(bg)' differs between new and previous
##   ==> NOT changing 'bg'
# how many allelic skew
gnomAD_main_table_summ_K562_skew <- gnomAD_main_table %>% 
    group_by(K562_skew_pred_bin) %>% 
    summarise(count = sum(count)) %>% 
    ungroup() %>% 
    mutate(prop = count/sum(count))

gnomAD_main_table_summ_HepG2_skew <- gnomAD_main_table %>% 
    group_by(HepG2_skew_pred_bin) %>% 
    summarise(count = sum(count)) %>% 
    ungroup() %>% 
    mutate(prop = count/sum(count))

gnomAD_main_table_summ_SKNSH_skew <- gnomAD_main_table %>% 
    group_by(SKNSH_skew_pred_bin) %>% 
    summarise(count = sum(count)) %>% 
    ungroup() %>% 
    mutate(prop = count/sum(count))

gnomAD_main_table_summ_mean_skew <- gnomAD_main_table %>% 
    group_by(mean_skew_pred_bin) %>% 
    summarise(count = sum(count)) %>% 
    ungroup() %>% 
    mutate(prop = count/sum(count))

gnomAD_main_table_summ_K562_skew %>% 
  tidyplot(x = K562_skew_pred_bin, y = count) %>% 
  add_barstack_absolute(fill = "#00A79D") %>% 
  remove_legend() %>% 
  adjust_x_axis(rotate_labels = TRUE) + 
  geom_text(aes(label = count), angle = 90, hjust = -0.1, size = 3, color = "#00A79D") +
  xlab("K562 allelic skew") + 
  ylab("Count")

ggsave("../../results/gnomAD_selection_analysis/how_many_K562_skew.pdf")
## Saving 7 x 5 in image
## Warning: 'mode(bg)' differs between new and previous
##   ==> NOT changing 'bg'
gnomAD_main_table_summ_HepG2_skew %>% 
  tidyplot(x = HepG2_skew_pred_bin, y = count) %>% 
  add_barstack_absolute(fill = "#FBB040") %>% 
  remove_legend() %>% 
  adjust_x_axis(rotate_labels = TRUE) + 
  geom_text(aes(label = count), angle = 90, hjust = -0.1, size = 3, color = "#FBB040") +
  xlab("HepG2 allelic skew") + 
  ylab("Count")

ggsave("../../results/gnomAD_selection_analysis/how_many_HepG2_skew.pdf")
## Saving 7 x 5 in image
## Warning: 'mode(bg)' differs between new and previous
##   ==> NOT changing 'bg'
gnomAD_main_table_summ_SKNSH_skew %>% 
  tidyplot(x = SKNSH_skew_pred_bin, y = count) %>% 
  add_barstack_absolute(fill = "#ED1C24") %>% 
  remove_legend() %>% 
  adjust_x_axis(rotate_labels = TRUE) + 
  geom_text(aes(label = count), angle = 90, hjust = -0.1, size = 3, color = "#ED1C24") +
  xlab("SKNSH allelic skew") + 
  ylab("Count")

ggsave("../../results/gnomAD_selection_analysis/how_many_SKNSH_skew.pdf")
## Saving 7 x 5 in image
## Warning: 'mode(bg)' differs between new and previous
##   ==> NOT changing 'bg'
gnomAD_main_table_summ_mean_skew %>% 
  tidyplot(x = mean_skew_pred_bin, y = count) %>% 
  add_barstack_absolute() %>% 
  remove_legend() %>% 
  adjust_x_axis(rotate_labels = TRUE) + 
  geom_text(aes(label = count), angle = 90, hjust = -0.1, size = 3) +
  xlab("Mean allelic skew") + 
  ylab("Count")

ggsave("../../results/gnomAD_selection_analysis/how_many_mean_skew.pdf")
## Saving 7 x 5 in image
## Warning: 'mode(bg)' differs between new and previous
##   ==> NOT changing 'bg'
nonsig_skew <- c("[-0.5, -0.2)", "[-0.2, -0.05)", "[-0.05, 0.05)", "[0.05, 0.2)", "[0.2, 0.5)")
a <- sum(filter(gnomAD_main_table, (K562_skew_pred_bin %in% nonsig_skew) & (HepG2_skew_pred_bin %in% nonsig_skew) & (SKNSH_skew_pred_bin %in% nonsig_skew)
)$count)
b <- sum(filter(gnomAD_main_table, !((K562_skew_pred_bin %in% nonsig_skew) & (HepG2_skew_pred_bin %in% nonsig_skew) & (SKNSH_skew_pred_bin %in% nonsig_skew)
))$count)
a
## [1] 504507736
b
## [1] 9378521
b/(a+b)
## [1] 0.01825019
# how many ref active
gnomAD_main_table_summ_K562_ref <- gnomAD_main_table %>% 
    group_by(K562_ref_pred_bin) %>% 
    summarise(count = sum(count)) %>% 
    ungroup() %>% 
    mutate(prop = count/sum(count))

gnomAD_main_table_summ_HepG2_ref <- gnomAD_main_table %>% 
    group_by(HepG2_ref_pred_bin) %>% 
    summarise(count = sum(count)) %>% 
    ungroup() %>% 
    mutate(prop = count/sum(count))

gnomAD_main_table_summ_SKNSH_ref <- gnomAD_main_table %>% 
    group_by(SKNSH_ref_pred_bin) %>% 
    summarise(count = sum(count)) %>% 
    ungroup() %>% 
    mutate(prop = count/sum(count))

gnomAD_main_table_summ_mean_ref <- gnomAD_main_table %>% 
    group_by(mean_ref_pred_bin) %>% 
    summarise(count = sum(count)) %>% 
    ungroup() %>% 
    mutate(prop = count/sum(count))

gnomAD_main_table_summ_K562_ref %>% 
  tidyplot(x = K562_ref_pred_bin, y = count) %>% 
  add_barstack_absolute(fill = "#00A79D") %>% 
  remove_legend() %>% 
  adjust_x_axis(rotate_labels = TRUE) + 
  geom_text(aes(label = count), angle = 90, hjust = -0.1, size = 3, color = "#00A79D") +
  xlab("K562 ref activity") + 
  ylab("Count")

ggsave("../../results/gnomAD_selection_analysis/how_many_K562_ref.pdf")
## Saving 7 x 5 in image
## Warning: 'mode(bg)' differs between new and previous
##   ==> NOT changing 'bg'
gnomAD_main_table_summ_HepG2_ref %>% 
  tidyplot(x = HepG2_ref_pred_bin, y = count) %>% 
  add_barstack_absolute(fill = "#FBB040") %>% 
  remove_legend() %>% 
  adjust_x_axis(rotate_labels = TRUE) + 
  geom_text(aes(label = count), angle = 90, hjust = -0.1, size = 3, color = "#FBB040") +
  xlab("HepG2 ref activity") + 
  ylab("Count")

ggsave("../../results/gnomAD_selection_analysis/how_many_HepG2_ref.pdf")
## Saving 7 x 5 in image
## Warning: 'mode(bg)' differs between new and previous
##   ==> NOT changing 'bg'
gnomAD_main_table_summ_SKNSH_ref %>% 
  tidyplot(x = SKNSH_ref_pred_bin, y = count) %>% 
  add_barstack_absolute(fill = "#ED1C24") %>% 
  remove_legend() %>% 
  adjust_x_axis(rotate_labels = TRUE) + 
  geom_text(aes(label = count), angle = 90, hjust = -0.1, size = 3, color = "#ED1C24") +
  xlab("SKNSH ref activity") + 
  ylab("Count")

ggsave("../../results/gnomAD_selection_analysis/how_many_SKNSH_ref.pdf")
## Saving 7 x 5 in image
## Warning: 'mode(bg)' differs between new and previous
##   ==> NOT changing 'bg'
gnomAD_main_table_summ_mean_ref %>% 
  tidyplot(x = mean_ref_pred_bin, y = count) %>% 
  add_barstack_absolute() %>% 
  remove_legend() %>% 
  adjust_x_axis(rotate_labels = TRUE) + 
  geom_text(aes(label = count), angle = 90, hjust = -0.1, size = 3) +
  xlab("Mean ref activity") + 
  ylab("Count")

ggsave("../../results/gnomAD_selection_analysis/how_many_mean_ref.pdf")
## Saving 7 x 5 in image
## Warning: 'mode(bg)' differs between new and previous
##   ==> NOT changing 'bg'
nonsig_ref <- c("[-Inf, 1)")
a <- sum(filter(gnomAD_main_table, (K562_ref_pred_bin %in% nonsig_ref) & (HepG2_ref_pred_bin %in% nonsig_ref) & (SKNSH_ref_pred_bin %in% nonsig_ref)
)$count)
b <- sum(filter(gnomAD_main_table, !((K562_ref_pred_bin %in% nonsig_ref) & (HepG2_ref_pred_bin %in% nonsig_ref) & (SKNSH_ref_pred_bin %in% nonsig_ref)
))$count)
a
## [1] 462431266
b
## [1] 51454991
b/(a+b)
## [1] 0.1001291
emVar_bins <- c(" (-Inf, -1.5)", "[-1.5, -1.0)", "[-1.0, -0.5)", "[0.5, 1.0)", "[1.0, 1.5)", "[1.5, Inf)")

sum(gnomAD_main_table %>% filter(ENCODE_cCREs == "PLS") %>% filter((K562_skew_pred_bin %in% emVar_bins) | (HepG2_skew_pred_bin %in% emVar_bins) | (SKNSH_skew_pred_bin %in% emVar_bins)) %>% .$count)/
  sum(gnomAD_main_table %>% filter(ENCODE_cCREs == "PLS") %>% .$count)
## [1] 0.08825359
sum(gnomAD_main_table %>% filter(ENCODE_cCREs == "pELS") %>% filter((K562_skew_pred_bin %in% emVar_bins) | (HepG2_skew_pred_bin %in% emVar_bins) | (SKNSH_skew_pred_bin %in% emVar_bins)) %>% .$count)/
  sum(gnomAD_main_table %>% filter(ENCODE_cCREs == "pELS") %>% .$count)
## [1] 0.04156389
sum(gnomAD_main_table %>% filter(ENCODE_cCREs == "dELS") %>% filter((K562_skew_pred_bin %in% emVar_bins) | (HepG2_skew_pred_bin %in% emVar_bins) | (SKNSH_skew_pred_bin %in% emVar_bins)) %>% .$count)/
  sum(gnomAD_main_table %>% filter(ENCODE_cCREs == "dELS") %>% .$count)
## [1] 0.03621046
sum(gnomAD_main_table %>% filter(ENCODE_cCREs == "non-cCRE") %>% filter((K562_skew_pred_bin %in% emVar_bins) | (HepG2_skew_pred_bin %in% emVar_bins) | (SKNSH_skew_pred_bin %in% emVar_bins)) %>% .$count)/
  sum(gnomAD_main_table %>% filter(ENCODE_cCREs == "non-cCRE") %>% .$count)
## [1] 0.01294262
gnomAD_main_table_full_cCREs_emVar_cat_phyloP <- gnomAD_main_table_full_cCREs_emVar_cat %>% 
  mutate(phyloP_cons = count*phyloP_cons_prop, phyloP_noncons = count*(1-phyloP_cons_prop)) %>% 
  dplyr::select(ENCODE_cCREs, emVar_category, phyloP_cons, phyloP_noncons) %>% 
  pivot_longer(cols = c(phyloP_cons, phyloP_noncons), names_to = "phyloP", values_to = "count") %>% 
  mutate(phyloP = gsub("phyloP_", "", phyloP)) %>% 
  mutate(ENCODE_cCREs_phyloP = paste(ENCODE_cCREs, phyloP, sep="_")) %>% 
  group_by(ENCODE_cCREs_phyloP, ENCODE_cCREs, phyloP) %>% 
  mutate(count_prop = count/sum(count), base = sum(count)) %>% 
  ungroup() %>% 
  filter(emVar_category != "none") %>% 
  mutate(ENCODE_cCREs_phyloP = factor(ENCODE_cCREs_phyloP, levels = c("PLS_noncons", "PLS_cons", "pELS_noncons", "pELS_cons", "dELS_noncons", "dELS_cons", "non-cCRE_noncons", "non-cCRE_cons"))) %>% 
  mutate(emVar_category = factor(emVar_category, levels = c("K562", "HepG2", "SKNSH", "K562+HepG2", "K562+SKNSH", "SKNSH+HepG2", "K562+SKNSH+HepG2")))

write_tsv(gnomAD_main_table_full_cCREs_emVar_cat_phyloP, "../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_emVar_cat_phyloP_stacked.txt")

gnomAD_main_table_full_cCREs_emVar_cat_phyloP %>% 
    group_by(ENCODE_cCREs, phyloP, base) %>% 
    summarise(
     count = sum(count), 
     count_prop = sum(count_prop)
    ) %>% 
    ungroup()
## `summarise()` has grouped output by 'ENCODE_cCREs', 'phyloP'. You can override
## using the `.groups` argument.
## # A tibble: 10 × 5
##    ENCODE_cCREs phyloP       base   count count_prop
##    <fct>        <chr>       <dbl>   <dbl>      <dbl>
##  1 PLS          cons       273419   48588     0.178 
##  2 PLS          noncons   1569996  114100     0.0727
##  3 pELS         cons       646903   50374     0.0779
##  4 pELS         noncons  10196308  400312     0.0393
##  5 dELS         cons      3684209  160162     0.0435
##  6 dELS         noncons  69919240 2505053     0.0358
##  7 non-cCRE     cons      5611880   63540     0.0113
##  8 non-cCRE     noncons 392563240 5089889     0.0130
##  9 Other cCREs  cons       839925   20759     0.0247
## 10 Other cCREs  noncons  28581137  844381     0.0295
gnomAD_main_table_full_cCREs_emVar_phyloP <- gnomAD_main_table_full_cCREs_emVar_cat %>% 
  mutate(phyloP_cons = count*phyloP_cons_prop, phyloP_noncons = count*(1-phyloP_cons_prop)) %>% 
  dplyr::select(ENCODE_cCREs, emVar_category, phyloP_cons, phyloP_noncons) %>% 
  pivot_longer(cols = c(phyloP_cons, phyloP_noncons), names_to = "phyloP", values_to = "count") %>% 
  mutate(phyloP = gsub("phyloP_", "", phyloP)) %>% 
  group_by(phyloP) %>% 
  mutate(count_prop = count/sum(count), base = sum(count)) %>% 
  ungroup() %>% 
  filter(emVar_category != "none") %>% 
  mutate(emVar_category = factor(emVar_category, levels = c("K562", "HepG2", "SKNSH", "K562+HepG2", "K562+SKNSH", "SKNSH+HepG2", "K562+SKNSH+HepG2")))

write_tsv(gnomAD_main_table_full_cCREs_emVar_phyloP, "../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_emVar_phyloP_stacked.txt")

gnomAD_main_table_full_cCREs_emVar_phyloP %>% 
    group_by(phyloP, base) %>% 
    summarise(
     count = sum(count), 
     count_prop = sum(count_prop)
    ) %>% 
    ungroup()
## `summarise()` has grouped output by 'phyloP'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 4
##   phyloP       base   count count_prop
##   <chr>       <dbl>   <dbl>      <dbl>
## 1 cons     11056336  343423     0.0311
## 2 noncons 502829921 8953735     0.0178
gnomAD_main_table_full_cCREs_emVar_cat_phyloP_cCRE <- gnomAD_main_table_full_cCREs_emVar_cat_phyloP %>% 
    group_by(ENCODE_cCREs, phyloP, base) %>% 
    summarise(
     count = sum(count), 
     count_prop = sum(count_prop)
    ) %>% 
    ungroup()
## `summarise()` has grouped output by 'ENCODE_cCREs', 'phyloP'. You can override
## using the `.groups` argument.
gnomAD_main_table_full_cCREs_emVar_phyloP_gw <- gnomAD_main_table_full_cCREs_emVar_phyloP %>%
    group_by(phyloP, base) %>%
    summarise(
     count = sum(count),
     count_prop = sum(count_prop)
    ) %>%
    ungroup() %>%
  mutate(ENCODE_cCREs = "Genome-wide")
## `summarise()` has grouped output by 'phyloP'. You can override using the
## `.groups` argument.
gnomAD_main_table_full_cCREs_emVar_phyloP_prop <- bind_rows(
  gnomAD_main_table_full_cCREs_emVar_phyloP_gw,
  gnomAD_main_table_full_cCREs_emVar_cat_phyloP_cCRE
) 
write_tsv(gnomAD_main_table_full_cCREs_emVar_phyloP_prop, "../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_emVar_phyloP_prop.txt")

gnomAD_main_table_full_cCREs_emVar_phyloP_prop %>% 
  mutate(ENCODE_cCREs = factor(ENCODE_cCREs, levels = c("PLS", "pELS", "dELS", "non-cCRE", "Genome-wide"))) %>% 
  mutate(phyloP = ifelse(phyloP == "cons", TRUE, FALSE)) %>% 
  filter(ENCODE_cCREs != "Other cCREs") %>% 
  tidyplot(x = ENCODE_cCREs, y = count_prop, color = phyloP) %>% 
  add_mean_bar() %>% 
  adjust_x_axis(rotate_labels = TRUE) + 
  xlab("ENCODE cCREs") + 
  ylab("emVar proportion") + 
  labs(fill = "Constrained") + 
  theme(aspect.ratio = 1.6, strip.background = element_blank())

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_emVar_phyloP_prop.pdf", scale = 0.7)
## Saving 4.9 x 3.5 in image
## Warning: 'mode(bg)' differs between new and previous
##   ==> NOT changing 'bg'
gnomAD_main_table_full_cCREs_emVar_phyloP_prop %>% 
  mutate(ENCODE_cCREs = factor(ENCODE_cCREs, levels = c("PLS", "pELS", "dELS", "non-cCRE", "Genome-wide"))) %>% 
  mutate(phyloP = ifelse(phyloP == "cons", TRUE, FALSE)) %>% 
  filter(ENCODE_cCREs != "Other cCREs") %>% 
  tidyplot(x = ENCODE_cCREs, y = count_prop, color = phyloP) %>% 
  add_mean_bar() %>% 
  adjust_x_axis(rotate_labels = TRUE) + 
  xlab("ENCODE cCREs") + 
  ylab("emVar proportion") + 
  labs(fill = "Constrained") + 
  theme(aspect.ratio = 1/1.01, strip.background = element_blank())

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_emVar_phyloP_prop_alt.pdf")
## Saving 7 x 5 in image
## Warning: 'mode(bg)' differs between new and previous
##   ==> NOT changing 'bg'
gnomAD_main_table_full_cCREs_emVar_phyloP_prop %>% 
  mutate(ENCODE_cCREs = factor(ENCODE_cCREs, levels = c("PLS", "pELS", "dELS", "non-cCRE", "Genome-wide"))) %>% 
  mutate(phyloP = ifelse(phyloP == "cons", TRUE, FALSE)) %>% 
  filter(ENCODE_cCREs != "Other cCREs") %>% 
  tidyplot(x = ENCODE_cCREs, y = count, color = phyloP) %>% 
  add_mean_bar() %>% 
  adjust_x_axis(rotate_labels = TRUE) + 
  geom_text(aes(label = count), angle = 90, hjust = -0.1, size = 3, 
            position = position_dodge(.9), show.legend = FALSE) +
  xlab("ENCODE cCREs") + 
  ylab("emVar count") + 
  labs(fill = "Constrained") + 
    theme(aspect.ratio = 1.6, strip.background = element_blank())

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_emVar_phyloP_count.pdf", scale = 0.7)
## Saving 4.9 x 3.5 in image
## Warning: 'mode(bg)' differs between new and previous
##   ==> NOT changing 'bg'
gnomAD_main_table_full_cCREs_emVar_phyloP_prop %>% 
  mutate(ENCODE_cCREs = factor(ENCODE_cCREs, levels = c("PLS", "pELS", "dELS", "non-cCRE", "Genome-wide"))) %>% 
  mutate(phyloP = ifelse(phyloP == "cons", TRUE, FALSE)) %>% 
  filter(ENCODE_cCREs != "Other cCREs") %>% 
  tidyplot(x = ENCODE_cCREs, y = count, color = phyloP) %>% 
  add_mean_bar() %>% 
  adjust_x_axis(rotate_labels = TRUE) + 
  geom_text(aes(label = count), angle = 90, hjust = -0.1, size = 3, 
            position = position_dodge(.9), show.legend = FALSE) +
  xlab("ENCODE cCREs") + 
  ylab("emVar count") + 
  labs(fill = "Constrained") + 
    theme(aspect.ratio = 1/1.01, strip.background = element_blank())

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_emVar_phyloP_count_alt.pdf")
## Saving 7 x 5 in image
## Warning: 'mode(bg)' differs between new and previous
##   ==> NOT changing 'bg'
ggplot(gnomAD_main_table_full_cCREs, 
     aes(x="", y=count, fill=ENCODE_cCREs)) +   #   %>% filter(ENCODE_cCREs != "Other cCREs")) +
    geom_col() + 
    geom_text(aes(label = paste0(round(count/1e6, 1), "M")),
     position = position_stack(vjust = 0.5)) +
    coord_polar("y", start=0) +
    theme_void() + 
    scale_fill_manual(values = c(PLS="#3E0751", pELS="#405187", dELS="#468E8B", `non-cCRE`="#79C66E", `Other cCREs`="grey50")) + 
    labs(fill = "ENCODE cCREs")

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_count.pdf", scale=0.6)
## Saving 4.2 x 3 in image
ggplot(gnomAD_main_table_full_cCREs_mean_ref %>% filter(ENCODE_cCREs != "Other cCREs")) + 
    geom_bar(aes(x = mean_ref_pred_bin, y = count, fill = ENCODE_cCREs), stat = "identity", position = "stack") + 
    cowplot::theme_cowplot() + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.line.x = element_blank(), axis.line.y = element_blank()) + 
    scale_fill_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) + 
    labs(x = "Mean ref activity", y = "Variants", color = "ENCODE cCREs") + 
    facet_grid(rows = vars(ENCODE_cCREs), scales = "free") + 
    theme(aspect.ratio = 1/(1.6*4), strip.background = element_blank()) + 
    guides(fill = "none")

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_mean_ref_count.pdf", scale=0.5)
## Saving 3.5 x 2.5 in image
ggplot(gnomAD_main_table_full_cCREs_mean %>% filter(ENCODE_cCREs != "Other cCREs")) + 
    geom_bar(aes(x = mean_skew_pred_bin, y = count, fill = ENCODE_cCREs), stat = "identity", position = "stack") + 
    cowplot::theme_cowplot() + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.line.x = element_blank(), axis.line.y = element_blank()) + 
    scale_fill_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) + 
    labs(x = "Mean allelic skew", y = "Variants", color = "ENCODE cCREs") + 
    facet_grid(rows = vars(ENCODE_cCREs), scales = "free") +
    theme(aspect.ratio = 1/(1.6*4), strip.background = element_blank()) + 
    guides(fill = "none")

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_mean_skew_count.pdf", scale=0.6)
## Saving 4.2 x 3 in image
gnomAD_main_table_full_cCREs_emVar_cat <- gnomAD_main_table_full_cCREs_emVar_cat %>% 
    group_by(ENCODE_cCREs) %>% 
    mutate(count_prop = count/sum(count)) %>% 
    ungroup()

ggplot(gnomAD_main_table_full_cCREs_emVar_cat %>% filter(ENCODE_cCREs != "Other cCREs", emVar_category != "none")) + 
    geom_bar(aes(x = emVar_category, y = count_prop, fill = ENCODE_cCREs), stat = "identity", position = "stack") + 
    cowplot::theme_cowplot() + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.line.x = element_blank(), axis.line.y = element_blank()) + 
    scale_fill_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) + 
    labs(x = "emVars (Malinois)", y = "Proportion", color = "ENCODE cCREs") + 
    facet_grid(rows = vars(ENCODE_cCREs), scales = "free") +
    theme(aspect.ratio = 1/(1.6*4), strip.background = element_blank()) + 
    guides(fill = "none")

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_emVar_cat_prop.pdf", scale=0.75)
## Saving 5.25 x 3.75 in image
gnomAD_cCREs_emVar_cat_ratio_shared_vs_specific <- gnomAD_main_table_full_cCREs_emVar_cat %>% filter(ENCODE_cCREs != "Other cCREs", emVar_category != "none") %>% 
  dplyr::select(ENCODE_cCREs, emVar_category, count) %>% 
  pivot_wider(names_from = emVar_category, values_from = count) %>% 
  mutate(shared_vs_specific = `K562+SKNSH+HepG2`/(`K562` + `HepG2` + `SKNSH`))

gnomAD_cCREs_emVar_cat_ratio_shared_vs_specific
## # A tibble: 4 × 9
##   ENCODE_cCREs    K562  HepG2  SKNSH `K562+HepG2` `K562+SKNSH` `SKNSH+HepG2`
##   <fct>          <int>  <int>  <int>        <int>        <int>         <int>
## 1 PLS            48804   5310  26471         8789        15250         11355
## 2 pELS          145101  28200  68595        24328        26640         44703
## 3 dELS          805978 218400 454219       140475       143428        296407
## 4 non-cCRE     1567945 375114 988059       218660       263041        644459
## # ℹ 2 more variables: `K562+SKNSH+HepG2` <int>, shared_vs_specific <dbl>
gnomAD_main_table_full_TF_footprints <- read_tsv("../../results/gnomAD_selection_analysis/gnomAD_TF_footprints_mean_allelic_skew.txt") %>% 
  mutate(mean_skew_pred_bin = factor(mean_skew_pred_bin, levels = 
    c("(-Inf, -1.5)", "[-1.5, -1.0)", "[-1.0, -0.5)", "[-0.5, -0.2)", 
    "[-0.2, -0.05)", "[-0.05, 0.05)", "[0.05, 0.2)", "[0.2, 0.5)", 
    "[0.5, 1.0)", "[1.0, 1.5)", "[1.5, Inf)"))
  ) %>% 
  group_by(mean_skew_pred_bin) %>%
  mutate(proportion = count/sum(count)) %>% 
  ungroup()
## Rows: 22 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): mean_skew_pred_bin
## dbl (1): count
## lgl (1): TF_footprints
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(gnomAD_main_table_full_TF_footprints) + 
  geom_bar(aes(x = mean_skew_pred_bin, y = proportion, fill = TF_footprints), stat = "identity", position = "fill") + 
  geom_text(aes(x = mean_skew_pred_bin, 
                y = proportion + 0.15,
                label = ifelse(TF_footprints, sprintf("%.3f", proportion), "")), 
            size = 3, angle = 90, na.rm = TRUE, color = "white") + 
  cowplot::theme_cowplot() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.line.x = element_blank()) + 
  labs(x = "Mean allelic skew", y = "Proportion", fill = "In TF ChIP-seq") + 
  theme(aspect.ratio = 1/1.6, strip.background = element_blank()) + 
  ylim(c(0,1)) + 
  guides(fill = "none")

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_TF_footprints_stack.pdf", scale=0.7)
## Saving 4.9 x 3.5 in image
gnomAD_main_table_full_TF_footprints_gw <- gnomAD_main_table_full_TF_footprints %>% 
  dplyr::select(TF_footprints, mean_skew_pred_bin, count) %>% 
  group_by(TF_footprints) %>% 
  summarise(count = sum(count)) %>% 
  ungroup() %>% 
  pivot_wider(names_prefix = "TF_footprints_", names_from = TF_footprints, values_from = count)

gnomAD_main_table_full_TF_footprints_temp <- gnomAD_main_table_full_TF_footprints %>% 
  dplyr::select(TF_footprints, mean_skew_pred_bin, count) %>% 
  pivot_wider(names_prefix = "TF_footprints_", names_from = TF_footprints, values_from = count)

gnomAD_main_table_full_TF_footprints_fet <- gnomAD_main_table_full_TF_footprints_temp %>% 
  rowwise() %>% 
  mutate(fet_TF_footprints = list(fisher.test(matrix(c(TF_footprints_TRUE+1, TF_footprints_FALSE+1, gnomAD_main_table_full_TF_footprints_gw[["TF_footprints_TRUE"]]-TF_footprints_TRUE+1, gnomAD_main_table_full_TF_footprints_gw[["TF_footprints_FALSE"]]-TF_footprints_FALSE+1), nrow=2)))) %>%
  mutate(fet_TF_footprints_pval = fet_TF_footprints$p.val) %>%
  mutate(fet_TF_footprints_odds = fet_TF_footprints$estimate) %>%
  mutate(fet_TF_footprints_lci = fet_TF_footprints$conf.int[1]) %>%
  mutate(fet_TF_footprints_uci = fet_TF_footprints$conf.int[2]) %>%
  ungroup() %>% 
  mutate(fet_TF_footprints_padj = p.adjust(fet_TF_footprints_pval, method = "fdr")) %>% 
  mutate(fet_TF_footprints_sig = ifelse(fet_TF_footprints_pval < 0.05, "*", ""))

ggplot(gnomAD_main_table_full_TF_footprints_fet %>% filter()) + 
  geom_bar(aes(x = mean_skew_pred_bin, y = (fet_TF_footprints_odds)), stat = "identity", fill = "#56BCC2") + 
  geom_errorbar(aes(x = mean_skew_pred_bin, ymin = (fet_TF_footprints_lci), ymax = (fet_TF_footprints_uci)), stat = "identity", width = 0.2) + 
  geom_text(aes(x = mean_skew_pred_bin, y = (fet_TF_footprints_uci) + 0.5, label = sprintf("%.2f", fet_TF_footprints_odds)), angle = 90, vjust = 0.5, hjust = 0) + 
  geom_hline(yintercept = 1, color = "black", linetype = "dashed", alpha = 0.5) + 
  cowplot::theme_cowplot() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.line.x = element_blank()) + 
  labs(x = "Mean allelic skew", y = "Odds ratio") + 
  theme(aspect.ratio = 1/1.6, strip.background = element_blank()) + 
  ylim(c(0, 11))

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_TF_footprints_fet.pdf", scale=0.7)
## Saving 4.9 x 3.5 in image
write_tsv(gnomAD_main_table_full_TF_footprints, "../../results/gnomAD_selection_analysis/gnomAD_main_table_full_TF_footprints_fet.txt")
gnomAD_main_table_full_TF_ChIP_seq <- read_tsv("../../results/gnomAD_selection_analysis/gnomAD_TF_ChIP_seq_mean_allelic_skew.txt") %>% 
  mutate(mean_skew_pred_bin = factor(mean_skew_pred_bin, levels = 
    c("(-Inf, -1.5)", "[-1.5, -1.0)", "[-1.0, -0.5)", "[-0.5, -0.2)", 
    "[-0.2, -0.05)", "[-0.05, 0.05)", "[0.05, 0.2)", "[0.2, 0.5)", 
    "[0.5, 1.0)", "[1.0, 1.5)", "[1.5, Inf)"))
  ) %>% 
  group_by(mean_skew_pred_bin) %>%
  mutate(proportion = count/sum(count)) %>% 
  ungroup()
## Rows: 22 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): mean_skew_pred_bin
## dbl (1): count
## lgl (1): TF_ChIP_seq
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(gnomAD_main_table_full_TF_ChIP_seq) + 
  geom_bar(aes(x = mean_skew_pred_bin, y = proportion, fill = TF_ChIP_seq), stat = "identity", position = "fill") + 
  geom_text(aes(x = mean_skew_pred_bin, 
                y = proportion + 0.15,
                label = ifelse(TF_ChIP_seq, sprintf("%.3f", proportion), "")), 
            size = 3, angle = 90, na.rm = TRUE, color = "white") + 
  cowplot::theme_cowplot() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.line.x = element_blank()) + 
  labs(x = "Mean allelic skew", y = "Proportion", fill = "In TF ChIP-seq") + 
  theme(aspect.ratio = 1/1.6, strip.background = element_blank()) + 
  ylim(c(0,1)) + 
  guides(fill = "none")

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_TF_ChIP_seq_stack.pdf", scale=0.7)
## Saving 4.9 x 3.5 in image
gnomAD_main_table_full_TF_ChIP_seq_gw <- gnomAD_main_table_full_TF_ChIP_seq %>% 
  dplyr::select(TF_ChIP_seq, mean_skew_pred_bin, count) %>% 
  group_by(TF_ChIP_seq) %>% 
  summarise(count = sum(count)) %>% 
  ungroup() %>% 
  pivot_wider(names_prefix = "TF_ChIP_seq_", names_from = TF_ChIP_seq, values_from = count)

gnomAD_main_table_full_TF_ChIP_seq_temp <- gnomAD_main_table_full_TF_ChIP_seq %>% 
  dplyr::select(TF_ChIP_seq, mean_skew_pred_bin, count) %>% 
  pivot_wider(names_prefix = "TF_ChIP_seq_", names_from = TF_ChIP_seq, values_from = count)

gnomAD_main_table_full_TF_ChIP_seq_fet <- gnomAD_main_table_full_TF_ChIP_seq_temp %>% 
  rowwise() %>% 
  mutate(fet_TF_ChIP_seq = list(fisher.test(matrix(c(TF_ChIP_seq_TRUE+1, TF_ChIP_seq_FALSE+1, gnomAD_main_table_full_TF_ChIP_seq_gw[["TF_ChIP_seq_TRUE"]]-TF_ChIP_seq_TRUE+1, gnomAD_main_table_full_TF_ChIP_seq_gw[["TF_ChIP_seq_FALSE"]]-TF_ChIP_seq_FALSE+1), nrow=2)))) %>%
  mutate(fet_TF_ChIP_seq_pval = fet_TF_ChIP_seq$p.val) %>%
  mutate(fet_TF_ChIP_seq_odds = fet_TF_ChIP_seq$estimate) %>%
  mutate(fet_TF_ChIP_seq_lci = fet_TF_ChIP_seq$conf.int[1]) %>%
  mutate(fet_TF_ChIP_seq_uci = fet_TF_ChIP_seq$conf.int[2]) %>%
  ungroup() %>% 
  mutate(fet_TF_ChIP_seq_padj = p.adjust(fet_TF_ChIP_seq_pval, method = "fdr")) %>% 
  mutate(fet_TF_ChIP_seq_sig = ifelse(fet_TF_ChIP_seq_pval < 0.05, "*", ""))

ggplot(gnomAD_main_table_full_TF_ChIP_seq_fet %>% filter()) + 
  geom_bar(aes(x = mean_skew_pred_bin, y = (fet_TF_ChIP_seq_odds)), stat = "identity", fill = "#56BCC2") + 
  geom_errorbar(aes(x = mean_skew_pred_bin, ymin = (fet_TF_ChIP_seq_lci), ymax = (fet_TF_ChIP_seq_uci)), stat = "identity", width = 0.2) + 
  geom_text(aes(x = mean_skew_pred_bin, y = (fet_TF_ChIP_seq_uci) + 0.5, label = sprintf("%.2f", fet_TF_ChIP_seq_odds)), angle = 90, vjust = 0.5, hjust = 0) + 
  geom_hline(yintercept = 1, color = "black", linetype = "dashed", alpha = 0.5) + 
  cowplot::theme_cowplot() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.line.x = element_blank()) + 
  labs(x = "Mean allelic skew", y = "Odds ratio") + 
  theme(aspect.ratio = 1/1.6, strip.background = element_blank()) + 
  ylim(c(0, 11))

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_TF_ChIP_seq_fet.pdf", scale=0.7)
## Saving 4.9 x 3.5 in image
write_tsv(gnomAD_main_table_full_TF_ChIP_seq, "../../results/gnomAD_selection_analysis/gnomAD_main_table_full_TF_ChIP_seq_fet.txt")
ggplot(gnomAD_main_table_full_cCREs_pleio %>% filter(ENCODE_cCREs != "Other cCREs") %>% mutate(pleio = as.factor(pleio))) + 
    geom_hline(yintercept = 0, color = "black") + 
    geom_bar(aes(pleio, phyloP_mean, fill = pleio), stat = "identity") + 
    geom_errorbar(aes(pleio, ymin = phyloP_lci, ymax = phyloP_uci), stat = "identity", width = 0.2) + 
    cowplot::theme_cowplot() + 
    theme(axis.line.x = element_blank()) + 
    scale_fill_viridis_d() + 
    scale_color_viridis_d() + 
    labs(x = "emVar pleiotropy", y = "Mean constraint (phyloP)") +
    guides(fill = "none") + 
    facet_wrap(~ENCODE_cCREs) + 
    theme(aspect.ratio = 1, strip.background = element_blank())

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_pleio_phyloP.pdf", scale=1)
## Saving 7 x 5 in image
ggplot(gnomAD_main_table_full_cCREs_mean_ref %>% filter(ENCODE_cCREs != "Other cCREs")) + 
    geom_hline(yintercept = 0, color = "black") + 
    geom_ribbon(aes(x = mean_ref_pred_bin, ymin = phyloP_lci, ymax = phyloP_uci, fill = ENCODE_cCREs, group = ENCODE_cCREs), position = position_dodge(width = 0), alpha = 0.5) +
    geom_point(aes(x = mean_ref_pred_bin, y = phyloP_mean, color = ENCODE_cCREs), position = position_dodge(width = 0)) + 
    geom_errorbar(aes(x = mean_ref_pred_bin, ymin = phyloP_lci, ymax = phyloP_uci, color = ENCODE_cCREs), width = 0.5, position = position_dodge(width = 0)) + 
    cowplot::theme_cowplot() + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.line.x = element_blank()) + 
    scale_fill_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) +   
    scale_color_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) + 
    labs(x = "Mean ref activitiy", y = "Mean constraint (phyloP)", color = "ENCODE cCREs") + 
    guides(fill = "none") + 
    theme(aspect.ratio = 1/0.99)

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_mean_ref_phyloP.pdf")
## Saving 7 x 5 in image
ggplot(gnomAD_main_table_full_cCREs_mean %>% filter(ENCODE_cCREs != "Other cCREs")) + 
    geom_hline(yintercept = 0, color = "black") + 
    geom_ribbon(aes(x = mean_skew_pred_bin, ymin = phyloP_lci, ymax = phyloP_uci, fill = ENCODE_cCREs, group = ENCODE_cCREs), position = position_dodge(width = 0), alpha = 0.5) +
    geom_point(aes(x = mean_skew_pred_bin, y = phyloP_mean, color = ENCODE_cCREs), position = position_dodge(width = 0)) + 
    geom_errorbar(aes(x = mean_skew_pred_bin, ymin = phyloP_lci, ymax = phyloP_uci, color = ENCODE_cCREs), width = 0.5, position = position_dodge(width = 0)) + 
    cowplot::theme_cowplot() + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.line.x = element_blank()) + 
    scale_fill_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) +   
    scale_color_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) + 
    labs(x = "Mean allelic skew", y = "Mean constraint (phyloP)", color = "ENCODE cCREs") + 
    guides(fill = "none") + 
    theme(aspect.ratio = 1/0.99)

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_mean_phyloP.pdf")
## Saving 7 x 5 in image
ggplot(gnomAD_main_table_full_cCREs_mean %>% filter(ENCODE_cCREs != "Other cCREs")) + 
  geom_hline(yintercept = 0, color = "black") + 
  geom_ribbon(aes(x = mean_skew_pred_bin, ymin = phyloP_lci, ymax = phyloP_uci, fill = ENCODE_cCREs, group = ENCODE_cCREs), position = position_dodge(width = 0), alpha = 0.5) +
  geom_point(aes(x = mean_skew_pred_bin, y = phyloP_mean, color = ENCODE_cCREs), position = position_dodge(width = 0)) + 
  geom_errorbar(aes(x = mean_skew_pred_bin, ymin = phyloP_lci, ymax = phyloP_uci, color = ENCODE_cCREs), width = 0.5, position = position_dodge(width = 0)) + 
  cowplot::theme_cowplot() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.line.x = element_blank()) + 
  scale_fill_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) + 
  scale_color_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) + 
  labs(x = "Mean allelic skew", y = "Mean constraint (phyloP)", color = "ENCODE cCREs") + 
  guides(fill = "none") + 
  theme(aspect.ratio = 3/2)

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_mean_phyloP_skinny.pdf", scale = 1)
## Saving 7 x 5 in image
ggplot(gnomAD_main_table_full_cCREs_K562 %>% filter(ENCODE_cCREs != "Other cCREs")) + 
    geom_hline(yintercept = 0, color = "black") + 
    geom_ribbon(aes(x = K562_skew_pred_bin, ymin = phyloP_lci, ymax = phyloP_uci, fill = ENCODE_cCREs, group = ENCODE_cCREs), position = position_dodge(width = 0), alpha = 0.5) +
    geom_point(aes(x = K562_skew_pred_bin, y = phyloP_mean, color = ENCODE_cCREs), position = position_dodge(width = 0)) + 
    geom_errorbar(aes(x = K562_skew_pred_bin, ymin = phyloP_lci, ymax = phyloP_uci, color = ENCODE_cCREs), width = 0.5, position = position_dodge(width = 0)) + 
    cowplot::theme_cowplot() + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.line.x = element_blank()) + 
    scale_fill_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) +   
    scale_color_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) + 
    labs(x = "K562 allelic skew", y = "Mean constraint (phyloP)", color = "ENCODE cCREs") + 
    guides(fill = "none") + 
    theme(aspect.ratio = 1/0.99)

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_K562_phyloP.pdf")
## Saving 7 x 5 in image
ggplot(gnomAD_main_table_full_cCREs_HepG2 %>% filter(ENCODE_cCREs != "Other cCREs")) + 
    geom_hline(yintercept = 0, color = "black") + 
    geom_ribbon(aes(x = HepG2_skew_pred_bin, ymin = phyloP_lci, ymax = phyloP_uci, fill = ENCODE_cCREs, group = ENCODE_cCREs), position = position_dodge(width = 0), alpha = 0.5) +
    geom_point(aes(x = HepG2_skew_pred_bin, y = phyloP_mean, color = ENCODE_cCREs), position = position_dodge(width = 0)) + 
    geom_errorbar(aes(x = HepG2_skew_pred_bin, ymin = phyloP_lci, ymax = phyloP_uci, color = ENCODE_cCREs), width = 0.5, position = position_dodge(width = 0)) + 
    cowplot::theme_cowplot() + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.line.x = element_blank()) + 
    scale_fill_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) +   
    scale_color_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) + 
    labs(x = "HepG2 allelic skew", y = "Mean constraint (phyloP)", color = "ENCODE cCREs") + 
    guides(fill = "none") + 
    theme(aspect.ratio = 1/0.99)

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_HepG2_phyloP.pdf")
## Saving 7 x 5 in image
ggplot(gnomAD_main_table_full_cCREs_SKNSH %>% filter(ENCODE_cCREs != "Other cCREs")) + 
    geom_hline(yintercept = 0, color = "black") + 
    geom_ribbon(aes(x = SKNSH_skew_pred_bin, ymin = phyloP_lci, ymax = phyloP_uci, fill = ENCODE_cCREs, group = ENCODE_cCREs), position = position_dodge(width = 0), alpha = 0.5) +
    geom_point(aes(x = SKNSH_skew_pred_bin, y = phyloP_mean, color = ENCODE_cCREs), position = position_dodge(width = 0)) + 
    geom_errorbar(aes(x = SKNSH_skew_pred_bin, ymin = phyloP_lci, ymax = phyloP_uci, color = ENCODE_cCREs), width = 0.5, position = position_dodge(width = 0)) + 
    cowplot::theme_cowplot() + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.line.x = element_blank()) + 
    scale_fill_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) +   
    scale_color_manual(values = c("#3E0751", "#405187", "#468E8B", "#79C66E", "grey50")) + 
    labs(x = "SKNSH allelic skew", y = "Mean constraint (phyloP)", color = "ENCODE cCREs") + 
    guides(fill = "none") + 
    theme(aspect.ratio = 1/0.99)

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_SKNSH_phyloP.pdf")
## Saving 7 x 5 in image
ggplot(gnomAD_main_table_full_cCREs_mean_category %>% filter(ENCODE_cCREs != "Other cCREs")) +
  geom_tile(aes(x = mean_skew_pred_bin, y = category_copy, fill = phyloP_mean)) +
  geom_text(aes(x = mean_skew_pred_bin, y = category_copy, 
    label = round(phyloP_mean, 2)),
    color = "white", size = 2) +  
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
    axis.line = element_blank()) + 
  scale_fill_viridis_c() + 
  labs(x = "Mean allelic skew", y = "ENCODE cCREs", fill = "Mean constraint (phyloP)") +
  facet_grid(rows = vars(ENCODE_cCREs)) +
  coord_fixed() +  # Ensures square tiles
  theme(strip.background = element_blank())
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_text()`).

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_mean_category_count_heat.pdf")
## Saving 7 x 5 in image
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_text()`).
ggplot(gnomAD_main_table_full_cCREs_category %>% filter(ENCODE_cCREs != "Other cCREs")) +
  geom_tile(aes(x = 1, y = category_copy, fill = phyloP_mean)) +
  geom_text(aes(x = 1, y = category_copy, 
    label = round(phyloP_mean, 2)),
    color = "white", size = 2) +  
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
    axis.line = element_blank()) + 
  scale_fill_viridis_c(limits = c(
    min(gnomAD_main_table_full_cCREs_mean_category$phyloP_mean, na.rm = T), 
    max(gnomAD_main_table_full_cCREs_mean_category$phyloP_mean, na.rm = T))) + 
  labs(x = "", y = "ENCODE cCREs", fill = "Mean constraint (phyloP)") +
  facet_grid(rows = vars(ENCODE_cCREs)) +
  coord_fixed() +  # Ensures square tiles
  theme(strip.background = element_blank())

ggsave("../../results/gnomAD_selection_analysis/gnomAD_main_table_full_cCREs_category_count_heat.pdf")
## Saving 7 x 5 in image
gnomAD_main_table_full_rare_common_phyloP_mean <- gnomAD_main_table_full_cCREs_category %>% 
  mutate(category_copy_temp = ifelse(category_copy %in% c("SINGLETON", "ULTRARARE", "RARE"), "RARE", "COMMON")) %>% 
  group_by(category_copy_temp) %>% 
  summarise(
    sum_count = sum(count), 
    sum_phyloP = sum(count * phyloP_mean)
  ) %>% 
  ungroup()

filter(gnomAD_main_table_full_rare_common_phyloP_mean, category_copy_temp == "RARE")$sum_phyloP/
  filter(gnomAD_main_table_full_rare_common_phyloP_mean, category_copy_temp == "RARE")$sum_count
## [1] -0.07896777
filter(gnomAD_main_table_full_rare_common_phyloP_mean, category_copy_temp == "COMMON")$sum_phyloP/
  filter(gnomAD_main_table_full_rare_common_phyloP_mean, category_copy_temp == "COMMON")$sum_count
## [1] -0.3016857
gnomAD_main_table_full_cCREs_mean %>% filter(ENCODE_cCREs == "PLS", mean_skew_pred_bin == "(-Inf, -1.5)") %>% dplyr::select(ENCODE_cCREs, mean_skew_pred_bin, fet_RARE_COMMON_odds, fet_RARE_COMMON_lci, fet_RARE_COMMON_uci)
## # A tibble: 1 × 5
##   ENCODE_cCREs mean_skew_pred_bin fet_RARE_COMMON_odds fet_RARE_COMMON_lci
##   <fct>        <fct>                             <dbl>               <dbl>
## 1 PLS          (-Inf, -1.5)                       1.84                1.52
## # ℹ 1 more variable: fet_RARE_COMMON_uci <dbl>
gnomAD_main_table_full_cCREs_mean %>% filter(ENCODE_cCREs == "PLS", mean_skew_pred_bin == "[1.0, 1.5)") %>% dplyr::select(ENCODE_cCREs, mean_skew_pred_bin, fet_RARE_COMMON_odds, fet_RARE_COMMON_lci, fet_RARE_COMMON_uci)
## # A tibble: 1 × 5
##   ENCODE_cCREs mean_skew_pred_bin fet_RARE_COMMON_odds fet_RARE_COMMON_lci
##   <fct>        <fct>                             <dbl>               <dbl>
## 1 PLS          [1.0, 1.5)                         1.43                1.20
## # ℹ 1 more variable: fet_RARE_COMMON_uci <dbl>
gnomAD_main_table_full_cCREs_mean %>% filter(ENCODE_cCREs == "PLS", mean_skew_pred_bin == "[1.5, Inf)") %>% dplyr::select(ENCODE_cCREs, mean_skew_pred_bin, fet_RARE_COMMON_odds, fet_RARE_COMMON_lci, fet_RARE_COMMON_uci)
## # A tibble: 1 × 5
##   ENCODE_cCREs mean_skew_pred_bin fet_RARE_COMMON_odds fet_RARE_COMMON_lci
##   <fct>        <fct>                             <dbl>               <dbl>
## 1 PLS          [1.5, Inf)                         1.20               0.850
## # ℹ 1 more variable: fet_RARE_COMMON_uci <dbl>
gnomAD_main_table_full_cCREs_mean %>% filter(ENCODE_cCREs == "PLS", mean_skew_pred_bin == "(-Inf, -1.5)") %>% dplyr::select(ENCODE_cCREs, mean_skew_pred_bin, phyloP_mean, phyloP_lci, phyloP_uci) 
## # A tibble: 1 × 5
##   ENCODE_cCREs mean_skew_pred_bin phyloP_mean phyloP_lci phyloP_uci
##   <fct>        <fct>                    <dbl>      <dbl>      <dbl>
## 1 PLS          (-Inf, -1.5)              3.25       3.14       3.36
gnomAD_main_table_full_cCREs_mean %>% filter(ENCODE_cCREs == "pELS", mean_skew_pred_bin == "(-Inf, -1.5)") %>% dplyr::select(ENCODE_cCREs, mean_skew_pred_bin, phyloP_mean, phyloP_lci, phyloP_uci) 
## # A tibble: 1 × 5
##   ENCODE_cCREs mean_skew_pred_bin phyloP_mean phyloP_lci phyloP_uci
##   <fct>        <fct>                    <dbl>      <dbl>      <dbl>
## 1 pELS         (-Inf, -1.5)             0.868      0.798      0.938
gnomAD_main_table_full_cCREs_mean %>% filter(ENCODE_cCREs == "dELS", mean_skew_pred_bin == "(-Inf, -1.5)") %>% dplyr::select(ENCODE_cCREs, mean_skew_pred_bin, phyloP_mean, phyloP_lci, phyloP_uci) 
## # A tibble: 1 × 5
##   ENCODE_cCREs mean_skew_pred_bin phyloP_mean phyloP_lci phyloP_uci
##   <fct>        <fct>                    <dbl>      <dbl>      <dbl>
## 1 dELS         (-Inf, -1.5)           -0.0525    -0.0717    -0.0333
gnomAD_main_table_full_cCREs_mean %>% filter(ENCODE_cCREs == "PLS", mean_skew_pred_bin == "[1.5, Inf)") %>% dplyr::select(ENCODE_cCREs, mean_skew_pred_bin, phyloP_mean, phyloP_lci, phyloP_uci) 
## # A tibble: 1 × 5
##   ENCODE_cCREs mean_skew_pred_bin phyloP_mean phyloP_lci phyloP_uci
##   <fct>        <fct>                    <dbl>      <dbl>      <dbl>
## 1 PLS          [1.5, Inf)               0.657      0.492      0.821
gnomAD_main_table_full_cCREs_mean %>% filter(ENCODE_cCREs == "pELS", mean_skew_pred_bin == "[1.5, Inf)") %>% dplyr::select(ENCODE_cCREs, mean_skew_pred_bin, phyloP_mean, phyloP_lci, phyloP_uci) 
## # A tibble: 1 × 5
##   ENCODE_cCREs mean_skew_pred_bin phyloP_mean phyloP_lci phyloP_uci
##   <fct>        <fct>                    <dbl>      <dbl>      <dbl>
## 1 pELS         [1.5, Inf)               0.184      0.130      0.237
gnomAD_main_table_full_cCREs_mean %>% filter(ENCODE_cCREs == "dELS", mean_skew_pred_bin == "[1.5, Inf)") %>% dplyr::select(ENCODE_cCREs, mean_skew_pred_bin, phyloP_mean, phyloP_lci, phyloP_uci) 
## # A tibble: 1 × 5
##   ENCODE_cCREs mean_skew_pred_bin phyloP_mean phyloP_lci phyloP_uci
##   <fct>        <fct>                    <dbl>      <dbl>      <dbl>
## 1 dELS         [1.5, Inf)              -0.152     -0.173     -0.131